In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import plotly as py
py.offline.init_notebook_mode()
#import colorlover as cl
from IPython.display import HTML, display
In [2]:
# Global seaborn settings: "poster" plotting context and "ticks" axes style.
sns.set_context("poster")
sns.set_style("ticks")
In [3]:
# Location codes that the state-level plots below treat as "not a state".
NON_STATES = set(["UNK", "USA", "AS", "DC", "GU", "MP", "PR", "VI"])

# Raw topic key -> label used in titles and tables.
TOPIC_MAPPING = dict(
    GunControl="Gun Control",
    Privacy="Privacy",
    Vaccine="Vaccine",
    ChildEducation="Child Education",
    SkinDamage="Skin Damage",
    SeatBelt="Seat Belt",
)

# Fixed presentation order of the topic labels.
topic_order = [
    "Gun Control",
    "Privacy",
    "Vaccine",
    "Child Education",
    "Skin Damage",
    "Seat Belt",
]


def _topic_label(raw_topic):
    """Return the display label for a raw topic name.

    Only the part before the first '/' is looked up in TOPIC_MAPPING; an
    unknown key raises KeyError.
    """
    return TOPIC_MAPPING[raw_topic.split('/')[0]]


# Load the analysis table and replace `topic_name` with its display label.
df = (
    pd.read_hdf("FINAL_ANALYSIS_DATA.h5", "final_data")
    .rename(columns={
        # u'is_controvertial': u'is_controversial'
    })
    .assign(topic_name=lambda frame: frame.topic_name.apply(_topic_label))
)
In [4]:
# Show the column labels of the loaded frame.
df.columns
Out[4]:
In [5]:
# Summary statistics of the number of distinct CATS entries per row.
# Rows with missing CATS are counted as holding a single placeholder entry.
df.CATS.fillna(0).apply(
    lambda cats: len(Counter(['UNK']) if cats == 0 else Counter(cats))
).describe()
Out[5]:
In [6]:
# Per-row Counter of the CATS entries. Rows with missing CATS get a single
# 'NONE' placeholder so that every row holds a Counter.
df["CATS_Counter"] = df.CATS.fillna(0).apply(
    lambda cats: Counter(['NONE']) if cats == 0 else Counter(cats)
)

# Peek at rows whose Counter has exactly two distinct keys.
# `.loc` replaces the deprecated (and later removed) `.ix` indexer; with a
# boolean mask plus a column label the selection is the same.
df.loc[df.CATS_Counter.apply(len) == 2, "CATS_Counter"].head()
Out[6]:
In [7]:
def get_string(x, cols=None):
    """Format entries of a row as "<Key>: <value>" pieces joined by "<br>".

    x: label-indexed row (e.g. a pandas Series).
    cols: labels to include, in order. Defaults to every label in
        ``x.index`` so the helper is call-compatible with the one-argument
        ``get_string(x)`` that a later cell defines under the same name.

    Returns the joined string; each key is title-cased.
    """
    if cols is None:
        cols = x.index
    return "<br>".join("%s: %s" % (k.title(), x[k]) for k in cols)
In [8]:
# Six-stop colour scale as [position, rgb] pairs, light to dark purple.
scl = [
    [0.0, 'rgb(242,240,247)'],
    [0.2, 'rgb(218,218,235)'],
    [0.4, 'rgb(188,189,220)'],
    [0.6, 'rgb(158,154,200)'],
    [0.8, 'rgb(117,107,177)'],
    [1.0, 'rgb(84,39,143)'],
]
In [9]:
# Per-state mean, row count and standard deviation of `is_controversial`,
# leaving out rows whose location is just "USA".
(
    df[df.u_state != "USA"]
    .groupby("u_state")["is_controversial"]
    .agg([np.mean, len, np.std])
    .reset_index()
)
Out[9]:
In [10]:
def get_string(x, cols=None):
    """Format entries of a row as "<Key>: <value>" pieces joined by "<br>".

    x: label-indexed row (e.g. a pandas Series); this is the form in which
        ``DataFrame.apply(get_string, axis=1)`` calls the helper.
    cols: optional labels to include, in order. Defaults to every label in
        ``x.index``. Accepting it keeps this definition compatible with the
        two-argument ``get_string(x, cols)`` from an earlier cell, which
        this definition replaces.

    Returns the joined string; each key is title-cased.
    """
    if cols is None:
        cols = x.index
    return "<br>".join("%s: %s" % (k.title(), x[k]) for k in cols)
def plot_map(df, location_col, value_col, text_cols,
             scl="Portland", title="", cbar_title=""):
    """Display a US-state choropleth inline through plotly's offline mode.

    df: frame with one row per location.
    location_col: column holding the location codes (locationmode is
        'USA-states').
    value_col: column that supplies the colour value; it is cast to float.
    text_cols: columns combined by ``get_string`` into each location's text.
    scl: colorscale handed to the trace.
    title: figure title; '<br>(Hover for details)' is appended to it.
    cbar_title: title of the colour bar.

    Returns None; the figure is shown via ``py.offline.iplot``.
    """
    white_outline = {"line": {"color": 'rgb(255,255,255)', "width": 2}}
    trace = {
        "type": 'choropleth',
        "colorscale": scl,
        "autocolorscale": False,
        "locations": df[location_col],
        "z": df[value_col].astype(float),
        "locationmode": 'USA-states',
        "text": df[text_cols].apply(get_string, axis=1),
        "marker": white_outline,
        "colorbar": {"title": cbar_title},
    }
    layout = {
        "title": '%s<br>(Hover for details)' % title,
        "geo": {
            "scope": 'usa',
            "projection": {"type": 'albers usa'},
            "showlakes": False,
        },
    }
    fig = {"data": [trace], "layout": layout}
    py.offline.iplot(fig, filename='d3-cloropleth-map')
In [11]:
# Purple colour scale kept available under `scl`; the call below passes the
# named 'Portland' scale instead.
scl = [
    [0.0, 'rgb(242,240,247)'],
    [0.2, 'rgb(218,218,235)'],
    [0.4, 'rgb(188,189,220)'],
    [0.6, 'rgb(158,154,200)'],
    [0.8, 'rgb(117,107,177)'],
    [1.0, 'rgb(84,39,143)'],
]

# Per-state mean / row count / std of `is_controversial`, without "USA" rows.
df_t = (
    df[df.u_state != "USA"]
    .groupby("u_state")["is_controversial"]
    .agg([np.mean, len, np.std])
    .reset_index()
)

plot_map(
    df_t, "u_state", "mean", ["u_state", "len", "std"],
    scl='Portland',
    title="Proportion of controversial tweets per state",
    cbar_title="Proportion",
)
In [12]:
# Per-state mean / row count / std of the per-row 'fakenews' count taken
# from CATS_Counter. Unlike the maps that follow, no rows are filtered out
# here (neither "USA" rows nor rows without URLs).
df_t = (
    df.assign(
        fakenews=df.CATS_Counter.apply(lambda counts: counts.get('fakenews', 0))
    )
    [["u_state", "fakenews"]]
    .groupby("u_state")["fakenews"]
    .agg([np.mean, len, np.std])
    .reset_index()
)

plot_map(
    df_t, "u_state", "mean", ["u_state", "len", "std"],
    scl='Portland',
    title="Proportion of fakenews urls per state",
    cbar_title="Proportion",
)
In [13]:
# One map per URL category, restricted to rows that contain URLs and whose
# location is not just "USA".
for url_type in ["fakenews", "news", "blog"]:
    df_t = (
        df[(df.u_state != "USA") & (df.t_n_urls > 0)]
        # Per-row count of this category, stored in a column named after it.
        .assign(**{
            url_type: lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0))
        })
        [["u_state", url_type]]
        .groupby("u_state")[url_type]
        .agg([np.mean, len, np.std])
        .reset_index()
    )
    # Row count of the 10th-largest group. nlargest(10).min() equals the old
    # sort_values().values[-10] when there are at least 10 groups, and falls
    # back to the smallest group instead of raising IndexError otherwise.
    min_rows = df_t["len"].nlargest(10).min()
    plot_map(
        df_t[
            # (df_t["len"] > (df_t["len"].sum() * 0.01))
            (df_t["len"] >= min_rows)
            & (~df_t["u_state"].isin(NON_STATES))
        ],
        "u_state", "mean", ["u_state", "len", "std"], scl='Portland',
        title="Proportion of %s urls (in tweets with URLs) per state" % url_type.title(),
        cbar_title="Proportion"
    )
In [14]:
# Number of rows per topic label.
df.topic_name.value_counts()
Out[14]:
In [15]:
# Purple colour scale kept available under `scl`; the maps below pass the
# named "Portland" scale instead.
scl = [[0.0, 'rgb(242,240,247)'], [0.2, 'rgb(218,218,235)'], [0.4, 'rgb(188,189,220)'],
       [0.6, 'rgb(158,154,200)'], [0.8, 'rgb(117,107,177)'], [1.0, 'rgb(84,39,143)']]
url_type = "fakenews"

# One map per topic for the chosen URL category.
for topic in topic_order:
    df_t = (
        df[(df.u_state != "USA")
           & (df.t_n_urls > 0)
           & (df.topic_name == topic)]
        # The column is named after `url_type` (not a hard-coded `fakenews=`
        # keyword) so the selection below keeps working if `url_type` changes.
        .assign(**{
            url_type: lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0))
        })
        [["u_state", url_type]]
        .groupby("u_state")[url_type]
        .agg([np.mean, len, np.std])
        .reset_index()
    )
    df_t["value_rank"] = df_t["mean"].rank(ascending=False)
    # Row count of the 10th-largest group; with fewer than 10 groups this is
    # the smallest group rather than an IndexError.
    min_rows = df_t["len"].nlargest(10).min()
    plot_map(
        df_t[
            # (df_t["len"] > (df_t["len"].sum() * 0.01))
            (df_t["len"] >= min_rows)
            & (~df_t["u_state"].isin(NON_STATES))
        ],
        "u_state", "mean", ["u_state", "value_rank", "mean", "len", "std"], scl="Portland",
        title=topic,
        cbar_title="Proportion"
    )
In [16]:
# Purple colour scale kept available under `scl`; the maps below pass the
# named "Portland" scale instead.
scl = [[0.0, 'rgb(242,240,247)'], [0.2, 'rgb(218,218,235)'], [0.4, 'rgb(188,189,220)'],
       [0.6, 'rgb(158,154,200)'], [0.8, 'rgb(117,107,177)'], [1.0, 'rgb(84,39,143)']]
url_type = "blog"

# One map per topic for the chosen URL category.
for topic in topic_order:
    df_t = (
        df[(df.u_state != "USA")
           & (df.t_n_urls > 0)
           & (df.topic_name == topic)]
        # Per-row count of this category, stored in a column named after it.
        .assign(**{
            url_type: lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0))
        })
        [["u_state", url_type]]
        .groupby("u_state")[url_type]
        .agg([np.mean, len, np.std])
        .reset_index()
    )
    df_t["value_rank"] = df_t["mean"].rank(ascending=False)
    # Row count of the 10th-largest group; with fewer than 10 groups this is
    # the smallest group rather than an IndexError from values[-10].
    min_rows = df_t["len"].nlargest(10).min()
    plot_map(
        df_t[
            # (df_t["len"] > (df_t["len"].sum() * 0.01))
            (df_t["len"] >= min_rows)
            & (~df_t["u_state"].isin(NON_STATES))
        ],
        "u_state", "mean", ["u_state", "value_rank", "mean", "len", "std"], scl="Portland",
        title=topic,
        cbar_title="Proportion"
    )
In [17]:
# Purple colour scale kept available under `scl`; the maps below pass the
# named "Portland" scale instead.
scl = [[0.0, 'rgb(242,240,247)'], [0.2, 'rgb(218,218,235)'], [0.4, 'rgb(188,189,220)'],
       [0.6, 'rgb(158,154,200)'], [0.8, 'rgb(117,107,177)'], [1.0, 'rgb(84,39,143)']]
url_type = "news"

# One map per topic for the chosen URL category.
for topic in topic_order:
    df_t = (
        df[(df.u_state != "USA")
           & (df.t_n_urls > 0)
           & (df.topic_name == topic)]
        # Per-row count of this category, stored in a column named after it.
        .assign(**{
            url_type: lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0))
        })
        [["u_state", url_type]]
        .groupby("u_state")[url_type]
        .agg([np.mean, len, np.std])
        .reset_index()
    )
    df_t["value_rank"] = df_t["mean"].rank(ascending=False)
    # Row count of the 10th-largest group; with fewer than 10 groups this is
    # the smallest group rather than an IndexError from values[-10].
    min_rows = df_t["len"].nlargest(10).min()
    plot_map(
        df_t[
            # (df_t["len"] > (df_t["len"].sum() * 0.01))
            (df_t["len"] >= min_rows)
            & (~df_t["u_state"].isin(NON_STATES))
        ],
        "u_state", "mean", ["u_state", "value_rank", "mean", "len", "std"], scl="Portland",
        title=topic,
        cbar_title="Proportion"
    )
In [18]:
def plot_map_subplots(df, geo_key, topic, location_col, value_col, text_cols,
                      scl="Portland", cbar_title=""):
    """Build the traces and geo layout for one panel of a multi-map figure.

    df: frame with one row per location.
    geo_key: name of the geo subplot both traces are attached to.
    topic: text drawn on the panel by a text-mode scattergeo trace placed
        at lon -82, lat 50.
    location_col, value_col, text_cols: columns for the location codes, the
        colour value (cast to float) and the per-location text built by
        ``get_string``.
    scl: colorscale for the choropleth trace.
    cbar_title: accepted but currently unused; the colour bar is hidden
        (showscale is False).

    Returns ``(data, layout)``: ``data`` is the list of the two trace dicts
    and ``layout`` is the geo dict, whose ``domain`` x/y lists are empty and
    left for the caller to fill in.
    """
    choropleth = {
        "type": 'choropleth',
        "colorscale": scl,
        "geo": geo_key,
        "autocolorscale": False,
        "showscale": False,
        "locations": df[location_col],
        "z": df[value_col].astype(float),
        "locationmode": 'USA-states',
        "text": df[text_cols].apply(get_string, axis=1),
        "marker": {"line": {"color": 'rgb(255,255,255)', "width": 2}},
    }
    panel_caption = {
        "type": 'scattergeo',
        "showlegend": False,
        "lon": [-82],
        "lat": [50],
        "geo": geo_key,
        "text": [topic],
        "mode": 'text',
    }
    geo_layout = {
        "scope": 'usa',
        "projection": {"type": 'albers usa'},
        "showlakes": False,
        "domain": {"x": [], "y": []},
    }
    return [choropleth, panel_caption], geo_layout
In [19]:
# Grid of small choropleths, one per topic, for a single URL category.
COLS = 3
ROWS = 2
url_type = "fakenews"
data = []
layout = dict(
    title='%s URL proportions per state' % url_type.title(),
    # showlegend = False,
    autosize=False,
    width=900,
    height=400,
    hovermode=False,
)
for i, topic in enumerate(topic_order):
    # Important to index geo with i+1 rather than i: the first panel is
    # keyed "geo", the following ones "geo2", "geo3", ...
    geo_key = "geo%s" % (i + 1) if i != 0 else "geo"
    col = i % COLS
    # Explicit floor division: `/` only yields the integer row index under
    # Python 2 semantics and would give fractional rows with true division.
    row = i // COLS
    df_t = (
        df[(df.u_state != "USA")
           & (df.t_n_urls > 0)
           & (df.topic_name == topic)]
        .assign(**{
            url_type: lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0))
        })
        [["u_state", url_type]]
        .groupby("u_state")[url_type]
        .agg([np.mean, len, np.std])
        .reset_index()
    )
    # Row count of the 10th-largest group; with fewer than 10 groups this is
    # the smallest group rather than an IndexError from values[-10].
    min_rows = df_t["len"].nlargest(10).min()
    data_t, layout_t = plot_map_subplots(
        df_t[(df_t["len"] >= min_rows)
             & (~df_t["u_state"].isin(NON_STATES))],
        geo_key, topic,
        "u_state", "mean", ["u_state", "len", "std"], scl='Portland',
        cbar_title="Proportion"
    )
    data.extend(data_t)
    layout[geo_key] = layout_t
    # Each panel occupies one cell of the COLS x ROWS grid.
    layout[geo_key]['domain']['x'] = [float(col) / float(COLS), float(col + 1) / float(COLS)]
    layout[geo_key]['domain']['y'] = [float(row) / float(ROWS), float(row + 1) / float(ROWS)]
    # Placement trace; formatted so the text is the same whether `print` is
    # a statement or a function.
    print("%s %s %s %s" % (geo_key, col, row, layout[geo_key]["domain"]))
fig = dict(data=data, layout=layout)
py.offline.iplot(fig, filename='d3-cloropleth-map')
In [20]:
# Grid of small choropleths, one per topic, for a single URL category.
COLS = 3
ROWS = 2
url_type = "blog"
data = []
layout = dict(
    title='%s URL proportions per state' % url_type.title(),
    # showlegend = False,
    autosize=False,
    width=900,
    height=400,
    hovermode=False,
)
for i, topic in enumerate(topic_order):
    # Important to index geo with i+1 rather than i: the first panel is
    # keyed "geo", the following ones "geo2", "geo3", ...
    geo_key = "geo%s" % (i + 1) if i != 0 else "geo"
    col = i % COLS
    # Explicit floor division: `/` only yields the integer row index under
    # Python 2 semantics and would give fractional rows with true division.
    row = i // COLS
    df_t = (
        df[(df.u_state != "USA")
           & (df.t_n_urls > 0)
           & (df.topic_name == topic)]
        .assign(**{
            url_type: lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0))
        })
        [["u_state", url_type]]
        .groupby("u_state")[url_type]
        .agg([np.mean, len, np.std])
        .reset_index()
    )
    # Row count of the 10th-largest group; with fewer than 10 groups this is
    # the smallest group rather than an IndexError from values[-10].
    min_rows = df_t["len"].nlargest(10).min()
    data_t, layout_t = plot_map_subplots(
        df_t[(df_t["len"] >= min_rows)
             & (~df_t["u_state"].isin(NON_STATES))],
        geo_key, topic,
        "u_state", "mean", ["u_state", "len", "std"], scl='Portland',
        cbar_title="Proportion"
    )
    data.extend(data_t)
    layout[geo_key] = layout_t
    # Each panel occupies one cell of the COLS x ROWS grid.
    layout[geo_key]['domain']['x'] = [float(col) / float(COLS), float(col + 1) / float(COLS)]
    layout[geo_key]['domain']['y'] = [float(row) / float(ROWS), float(row + 1) / float(ROWS)]
    # Placement trace; formatted so the text is the same whether `print` is
    # a statement or a function.
    print("%s %s %s %s" % (geo_key, col, row, layout[geo_key]["domain"]))
fig = dict(data=data, layout=layout)
py.offline.iplot(fig, filename='d3-cloropleth-map')
In [21]:
# Grid of small choropleths, one per topic, for a single URL category.
COLS = 3
ROWS = 2
url_type = "news"
data = []
layout = dict(
    title='%s URL proportions per state' % url_type.title(),
    # showlegend = False,
    autosize=False,
    width=900,
    height=400,
    hovermode=False,
)
for i, topic in enumerate(topic_order):
    # Important to index geo with i+1 rather than i: the first panel is
    # keyed "geo", the following ones "geo2", "geo3", ...
    geo_key = "geo%s" % (i + 1) if i != 0 else "geo"
    col = i % COLS
    # Explicit floor division: `/` only yields the integer row index under
    # Python 2 semantics and would give fractional rows with true division.
    row = i // COLS
    df_t = (
        df[(df.u_state != "USA")
           & (df.t_n_urls > 0)
           & (df.topic_name == topic)]
        .assign(**{
            url_type: lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0))
        })
        [["u_state", url_type]]
        .groupby("u_state")[url_type]
        .agg([np.mean, len, np.std])
        .reset_index()
    )
    # Row count of the 10th-largest group; with fewer than 10 groups this is
    # the smallest group rather than an IndexError from values[-10].
    min_rows = df_t["len"].nlargest(10).min()
    data_t, layout_t = plot_map_subplots(
        df_t[(df_t["len"] >= min_rows)
             & (~df_t["u_state"].isin(NON_STATES))],
        geo_key, topic,
        "u_state", "mean", ["u_state", "len", "std"], scl='Portland',
        cbar_title="Proportion"
    )
    data.extend(data_t)
    layout[geo_key] = layout_t
    # Each panel occupies one cell of the COLS x ROWS grid.
    layout[geo_key]['domain']['x'] = [float(col) / float(COLS), float(col + 1) / float(COLS)]
    layout[geo_key]['domain']['y'] = [float(row) / float(ROWS), float(row + 1) / float(ROWS)]
    # Placement trace; formatted so the text is the same whether `print` is
    # a statement or a function.
    print("%s %s %s %s" % (geo_key, col, row, layout[geo_key]["domain"]))
fig = dict(data=data, layout=layout)
py.offline.iplot(fig, filename='d3-cloropleth-map')
In [22]:
# For every topic, list the groups with the most rows, ordered by their mean
# 'fakenews' count, as "<state> (<mean>) [<rows>]" strings; the topics are
# then shown side by side.
df_topics = {}
for topic in topic_order:
    df_t = (
        df[(df.u_state != "USA")
           & (df.t_n_urls > 0)
           & (df.topic_name == topic)]
        .assign(
            fakenews=lambda x: x.CATS_Counter.apply(lambda k: k.get('fakenews', 0))
        )
        [["u_state", "fakenews"]]
        .groupby("u_state")["fakenews"]
        .agg([np.mean, len, np.std])
        .reset_index()
    )
    df_t["value_rank"] = df_t["mean"].rank(ascending=False)
    # Row count of the 10th-largest group; with fewer than 10 groups this is
    # the smallest group rather than an IndexError from values[-10].
    min_rows = df_t["len"].nlargest(10).min()
    df_topics[topic] = (
        df_t[
            (df_t["len"] >= min_rows)
            # (df_t["len"] > (df_t["len"].sum() * 0.01)
        ]
        .sort_values("mean", ascending=False)
        # Fresh 0..n-1 index so the topics line up row by row in the concat.
        .reset_index()
        .apply(
            lambda x: "%s (%.2f) [%s]" % (x["u_state"], x["mean"], x["len"]),
            axis=1)
    )
pd.concat(df_topics, axis=1, keys=topic_order)
Out[22]:
In [23]:
# Bar chart of `is_controversial` per `u_state` (seaborn barplot with its
# default estimator), leaving out every code listed in NON_STATES.
fig, ax = plt.subplots(1,1,figsize=(15,5))
# Temporary overrides for font sizes and font family while plotting.
with sns.plotting_context(
rc={"axes.titlesize": 14,
"axes.labelsize": 14,
"xtick.labelsize": 12,
"ytick.labelsize": 14,
}), sns.axes_style(
rc={"font.family": "monospace"}):
# Rows are sorted by `u_state` before plotting; no explicit `order` is given.
g = sns.barplot(y="is_controversial", x="u_state",
errwidth=2,
data=df[~df.u_state.isin(NON_STATES)].sort_values("u_state"),
ax=ax, color="0.7")
# Dashed reference line at y = 0.5.
ax.axhline(y=0.5, linestyle='--', color="k", lw=1.)
ax.set_ylabel("Proportion of controversial tweets")
ax.set_xlabel("US States")
#ax.tick_params(axis='x', which='major', labelsize=10)
sns.despine(offset=10)
In [24]:
# Bar order: the two catch-all codes, then every other code seen in the data
# that is not in NON_STATES (alphabetical), then the six listed non-state
# codes.
other_codes = sorted(["AS", "DC", "GU", "MP", "PR", "VI"])
state_codes = sorted(
    set(df.u_state.fillna("UNK").value_counts().index) - NON_STATES
)
LOCATION_ORDER = ["UNK", "USA"] + state_codes + other_codes

# One colour per entry of LOCATION_ORDER. The middle run is sized from the
# data instead of a hard-coded 50, so colours stay aligned with their bars
# (and the list never outgrows the bars) when the data does not contain
# exactly 50 state codes.
colors = ["b"] * 2 + ["r"] * len(state_codes) + ["0.7"] * len(other_codes)
# Bar chart of `is_controversial` per location in LOCATION_ORDER (seaborn
# barplot with its default estimator); missing `u_state` is shown as "UNK".
fig, ax = plt.subplots(1,1,figsize=(16,5))
# Temporary overrides for font sizes and font family while plotting.
with sns.plotting_context(
rc={"axes.titlesize": 14,
"axes.labelsize": 14,
"xtick.labelsize": 12,
"ytick.labelsize": 14,
}), sns.axes_style(
rc={"font.family": "monospace"}):
g = sns.barplot(y="is_controversial", x="u_state",
errwidth=2,
data=df.assign(u_state=df.u_state.fillna("UNK")),
ax=ax, color="r", order=LOCATION_ORDER)
# Dashed reference line at y = 0.5.
ax.axhline(y=0.5, linestyle='--', color="k", lw=1.)
ax.set_ylabel("Proportion of controversial tweets")
ax.set_xlabel("US States")
#ax.tick_params(axis='x', which='major', labelsize=10)
# Recolour the bars by position: colors[i] goes to ax.patches[i], so `colors`
# must not have more entries than there are bars.
[ax.patches[i].set_color(c) for i, c in enumerate(colors)]
sns.despine(offset=10)
# Only the first three tick labels are rotated.
plt.setp(ax.get_xticklabels()[:3], rotation=90)
In [25]:
# Share of all controversial rows (from locations outside NON_STATES) that
# falls in each state.
# LOCATION_ORDER: "UNK", "USA", then the sorted codes seen in the data that
# are not in NON_STATES, then the six listed non-state codes.
LOCATION_ORDER = (["UNK", "USA"] + sorted(set(
df.u_state.fillna("UNK").value_counts().index
) - NON_STATES)+ sorted(["AS", "DC", "GU",
"MP", "PR", "VI"]))
# `colors` is not used in this cell: its only use below is commented out.
colors = ["b"] * 2 + ["r"]*50 +["k"]*6
# Multiplying by 1. makes the total a float, so the estimator's division
# below is a true division.
total_controversial = df[(df.is_controversial == 1) & (~df.u_state.isin(NON_STATES))].shape[0] * 1.
fig, ax = plt.subplots(1,1,figsize=(16,5))
# Temporary overrides for font sizes and font family while plotting.
with sns.plotting_context(
rc={"axes.titlesize": 14,
"axes.labelsize": 14,
"xtick.labelsize": 12,
"ytick.labelsize": 14,
}), sns.axes_style(
rc={"font.family": "monospace"}):
# Bar height = rows in the state / total_controversial (custom estimator);
# LOCATION_ORDER[2:-6] drops the two leading and the six trailing entries.
g = sns.barplot(y="is_controversial", x="u_state",
data=df[
(df.is_controversial == 1)
& (~df.u_state.isin(NON_STATES))
],
ax=ax, color="0.5",
order=LOCATION_ORDER[2:-6],
ci=None, estimator=lambda x: len(x)/total_controversial)
#ax.axhline(y=0.5, linestyle='--', color="k", lw=1.)
ax.set_ylabel("Distribution of controversial tweets\nacross states")
ax.set_xlabel("US States")
#ax.tick_params(axis='x', which='major', labelsize=10)
#[ax.patches[i].set_color(c) for i, c in enumerate(colors)]
sns.despine(offset=10)
#plt.setp(ax.get_xticklabels()[:3], rotation=90)
In [26]:
# Count of `t_id` values per (u_state, topic_name), for rows with a known
# `u_state` that is not in NON_STATES.
df_t = df[(~df.u_state.isin(NON_STATES)) & (~df.u_state.isnull())].pivot_table(
index="u_state", columns="topic_name", values="t_id", aggfunc=len)
# Temporary overrides for font sizes and font family while plotting.
with sns.plotting_context(
rc={"axes.titlesize": 10,
"axes.labelsize": 10,
"xtick.labelsize": 10,
"ytick.labelsize": 10,
}), sns.axes_style(
rc={"font.family": "monospace"}):
# Each topic column is divided by its own column total before plotting, so
# the dots show each state's share of that topic's rows.
g = sns.PairGrid(df_t.divide(df_t.sum(axis=0), axis=1).reset_index(),
x_vars=topic_order, y_vars=["u_state"],
size=10, aspect=.25)
# Draw a dot plot using the stripplot function
g.map(sns.stripplot, size=10, orient="h",
color="k", edgecolor="gray")
# Set the axis labels on every panel (no x-axis limits are set here)
g.set(xlabel="proportion", ylabel="",)
# Use semantically meaningful titles for the columns
titles = topic_order
for ax, title in zip(g.axes.flat, titles):
# Set a different title for each axes
ax.set(title=title)
# Make the grid horizontal instead of vertical
ax.xaxis.grid(False)
ax.yaxis.grid(True)
sns.despine(left=True, bottom=True)
In [27]:
# Bar order: the two catch-all codes, then every other code seen in the data
# that is not in NON_STATES (alphabetical), then the six listed non-state
# codes.
other_codes = sorted(["AS", "DC", "GU", "MP", "PR", "VI"])
state_codes = sorted(
    set(df.u_state.fillna("UNK").value_counts().index) - NON_STATES
)
LOCATION_ORDER = ["UNK", "USA"] + state_codes + other_codes

# One colour per entry of LOCATION_ORDER. The middle run is sized from the
# data instead of a hard-coded 50, so colours stay aligned with their bars
# (and the list never outgrows the bars) when the data does not contain
# exactly 50 state codes.
colors = ["b"] * 2 + ["r"] * len(state_codes) + ["0.7"] * len(other_codes)
# Number of rows per author location (missing `u_state` shown as "UNK"),
# drawn in LOCATION_ORDER on a logarithmic y axis.
# Temporary overrides for font sizes and font family while plotting.
with sns.plotting_context(
rc={"axes.titlesize": 14,
"axes.labelsize": 14,
"xtick.labelsize": 12,
"ytick.labelsize": 14,
}), sns.axes_style(
rc={"font.family": "monospace"}):
fig, ax = plt.subplots(1,1, figsize=(20,5))
ax = sns.countplot(df.u_state.fillna("UNK"), color='k', ax=ax,
order=LOCATION_ORDER)
ax.set_yscale('log')
ax.set_ylabel('Frequency')
ax.set_xlabel('Tweet author location')
plt.xticks(rotation='vertical')
#sns.despine(offset=2)
# Recolour the bars by position: colors[i] goes to ax.patches[i], so `colors`
# must not have more entries than there are bars.
[ax.patches[i].set_color(c) for i, c in enumerate(colors)]
In [28]:
# Location frequency table, split into four chunks shown side by side as
# (Location, Counts) column pairs.
location_counts = df.u_state.fillna("UNK").value_counts()
pd.concat(
    [
        pd.DataFrame(chunk.reset_index().values, columns=["Location", "Counts"])
        for chunk in np.array_split(location_counts, 4, axis=0)
    ],
    axis=1,
)
Out[28]:
In [29]:
# Summary of the `u_state` column over all rows.
df.u_state.describe()
Out[29]:
In [30]:
# Total number of rows in the `u_state` column (missing values included).
df.u_state.shape
Out[30]:
In [31]:
# The same summary at user level: one `u_state` value per `u_id`, taken with
# groupby(...).first().
state_per_user = df.groupby("u_id")["u_state"].first()
state_per_user.shape, state_per_user.describe()
Out[31]:
In [ ]: